We are working with a dataset from a Czech bank.
Below we load all the packages and data, then organize and structure the data so we can work with it. Loading all packages and data:
library(tidyverse)
library(tidymodels)
library(ggmosaic)
library(ggalluvial)
library(rpart)
library(rpart.plot)
library(gridExtra)
library(usethis)
library(ggplot2)
library(dplyr)
# Load the eight source tables from the working directory.
# All files are semicolon-separated and are read with base read.csv(),
# so every object is a plain data.frame.
read_bank_table <- function(path) {
  read.csv(path, sep = ";")
}

account  <- read_bank_table("./account.csv")
card     <- read_bank_table("./card.csv")
client   <- read_bank_table("./client.csv")
disp     <- read_bank_table("./disp.csv")
district <- read_bank_table("./district.csv")
loan     <- read_bank_table("./loan.csv")
order    <- read_bank_table("./order.csv")
trans    <- read_bank_table("./trans.csv")
clean/organize account
# Clean the account table.
# The raw `date` value is parsed with the yymmdd pattern, the century is
# forced to 19xx, and the result is stored as a Date in `date_ymd`; the raw
# column is dropped afterwards.
account <- account %>%
  mutate(
    date_ymd = as.Date(
      format(as.Date(as.character(date), "%y%m%d"), "19%y/%m/%d")
    )
  ) %>%
  select(-date)

# Translate the Czech frequency codes, then factorize.
# The order matters: case_when() returns a character vector, so converting
# to a factor *before* recoding is silently undone (the column ended up as
# `chr`). Codes without a mapping become NA.
account <- account %>%
  mutate(
    frequency = factor(case_when(
      frequency == 'POPLATEK MESICNE' ~ 'monthly issuance',
      frequency == 'POPLATEK TYDNE' ~ 'weekly issuance',
      frequency == 'POPLATEK PO OBRATU' ~ 'issuance after transaction'
    ))
  )
clean/organize card
# Clean the card table in one step:
#  - `issued_date`: the raw `issued` value parsed with the yymmdd pattern,
#    century forced to 19xx, stored as a Date (the raw column is kept)
#  - `type`: converted to a factor
card <- card %>%
  mutate(
    issued_date = as.Date(
      format(as.Date(as.character(issued), "%y%m%d"), "19%y/%m/%d")
    ),
    type = as_factor(type)
  )
clean/organize client (Dani fragen)
# Decode the client birth number.
# Characters 3-4 hold the month of birth; a value above 12 marks a female
# client (the month carries a +50 offset, removed below by subtracting 5000).
#
# as.integer() replaces strtoi() here: strtoi() can apply C-style base
# detection to zero-prefixed strings, so "08" and "09" came back as NA and
# left `month` missing for those clients (gender and birthday were only
# correct because NA falls through to the TRUE branch).
client <- client %>%
  mutate(
    month = as.integer(substr(birth_number, 3, 4)),
    gender = case_when(
      month > 12 ~ 'f',
      TRUE ~ 'm'
    ),
    # strip the female month offset so the number reads as plain yymmdd
    birth_number = case_when(
      month > 12 ~ as.numeric(birth_number) - 5000,
      TRUE ~ as.numeric(birth_number)
    )
  )

# Build the birthday as a Date (century forced to 19xx) and factorize the
# categorical columns.
client <- client %>%
  mutate(
    birthday = as.Date(
      format(as.Date(as.character(birth_number), "%y%m%d"), "19%y/%m/%d")
    ),
    district_id = as_factor(district_id),
    gender = as_factor(gender)
  )
# Reference date against which every client age is measured.
project_date <- '1998/1/1'
library(eeptools)
x <- as.Date(project_date)
# Age in completed years on the reference date.
# (This computation used to be repeated twice verbatim; once is enough.)
client$age <- floor(age_calc(as.Date(client$birthday), x, units = "years"))
birthday proper shape
# Prefix the century so the yymmdd birth number reads as 19yymmdd.
client$birth_number <- client$birth_number + 19000000
clean/organize disp
# The disposition type is categorical: store it as a factor.
disp$type <- as_factor(disp$type)
clean/organize district
#rename columns
# NOTE(review): 'municipalities<500-1000' and 'municipalities<2000-9999' look
# inconsistent with their neighbours - confirm the ranges against the data
# dictionary before relying on these names.
names(district) <- c('district_id', 'district_name', 'region', 'habitants', 'municipalities<499',
'municipalities<500-1000', 'municipalities<2000-9999', 'municipalities>10000', 'cities',
'ratio_urban_inhabitants', 'average_salary', 'unemployment_rate_1995', 'unemployment_rate_1996',
'enterpreneurs_per_1000', 'crimes_1995', 'crimes_1996')
#factorize variables
district <- district %>%
  mutate(district_name = as_factor(district_name), region = as_factor(region))
#change datatype to numeric/int
# The 1995 columns contain a non-numeric placeholder for missing values
# (presumably "?" - verify against district.csv). It is mapped to NA
# explicitly, so the conversion no longer relies on coercion: as.numeric()
# raised an "NAs introduced by coercion" warning and strtoi() hid the same
# problem silently. Any other unexpected text will still raise a warning.
district <- district %>%
  mutate(
    unemployment_rate_1995 = as.numeric(na_if(as.character(unemployment_rate_1995), "?")),
    enterpreneurs_per_1000 = as.integer(enterpreneurs_per_1000),
    crimes_1995 = as.integer(na_if(as.character(crimes_1995), "?"))
  )
#str(district)
clean/organize loan
# Clean the loan table in a single pipeline:
#  - `date_issued`: raw `date` parsed with the yymmdd pattern, century forced
#    to 19xx, stored as a Date
#  - `duration` and `status`: converted to factors
#  - the raw `date` column is dropped
loan <- loan %>%
  mutate(
    date_issued = as.Date(
      format(as.Date(as.character(date), "%y%m%d"), "19%y/%m/%d")
    ),
    duration = as_factor(duration),
    status = as_factor(status)
  ) %>%
  select(-date)
clean/organize order
# Clean the order table.
# `bank_to` is factorized directly. `k_symbol` is translated first and
# factorized afterwards: case_when() returns character, so factorizing
# before the translation (as was done previously) is silently undone.
# Codes without a mapping (e.g. blank entries) become NA.
# NOTE(review): 'issurance payment' is presumably a typo for
# 'insurance payment'; the label is left unchanged because later code may
# match on it - confirm before renaming.
order <- order %>%
  mutate(
    bank_to = as_factor(bank_to),
    k_symbol = factor(case_when(
      k_symbol == 'POJISTNE' ~ 'issurance payment',
      k_symbol == 'SIPO' ~ 'household',
      k_symbol == 'LEASING' ~ 'leasing',
      k_symbol == 'UVER' ~ 'loan payment'
    ))
  )
#str(order)
clean/organize trans
# Clean the transaction table.
# `date_dmy`: raw `date` parsed with the yymmdd pattern, century forced to
# 19xx, stored as a Date; the raw column is dropped.
trans <- trans %>%
  mutate(
    date_dmy = as.Date(
      format(as.Date(as.character(date), "%y%m%d"), "19%y/%m/%d")
    )
  ) %>%
  select(-date)

# Translate the Czech codes, then factorize.
# case_when() returns character, so the factor conversion has to come last;
# doing it first (as before) left all three columns as plain character.
# factor() sorts the levels alphabetically, which matches what a later
# as.factor() call on these columns would produce.
# Codes without a mapping (e.g. blank entries) become NA.
trans <- trans %>%
  mutate(
    type = factor(case_when(
      type == 'PRIJEM' ~ 'credit',
      type == 'VYDAJ' ~ 'withdrawal',
      type == 'VYBER' ~ 'withdrawal in cash'
    )),
    operation = factor(case_when(
      operation == 'VYBER KARTOU' ~ 'credit card withdrawal',
      operation == 'VKLAD' ~ 'credit in cash',
      operation == 'PREVOD Z UCTU' ~ 'collection from other bank',
      operation == 'VYBER' ~ 'withdrawal in cash',
      operation == 'PREVOD NA UCET' ~ 'remittance to other bank'
    )),
    # 'SANKC. UROK' was missing from the mapping and therefore turned into
    # NA (code and meaning taken from the published data dictionary -
    # verify that it occurs in trans.csv).
    # NOTE(review): 'insurrance payment' and 'hosehold' look like typos
    # ('insurance payment', 'household'); left unchanged because later code
    # may match on these labels - confirm before renaming.
    k_symbol = factor(case_when(
      k_symbol == 'POJISTNE' ~ 'insurrance payment',
      k_symbol == 'SLUZBY' ~ 'payment for statement',
      k_symbol == 'UROK' ~ 'interest credited',
      k_symbol == 'SANKC. UROK' ~ 'sanction interest if negative balance',
      k_symbol == 'SIPO' ~ 'hosehold',
      k_symbol == 'DUCHOD' ~ 'old-age pension',
      k_symbol == 'UVER' ~ 'loan payment'
    ))
  )
Bring together client and disp
# One row per disposition with the client attributes and, where present,
# the card attached to that disposition.
# Both `disp` and `card` carry a `type` column; after the joins they appear
# as type.x / type.y and are renamed to say which is which.
client_disp <- disp %>%
  full_join(client, by = 'client_id') %>%
  full_join(card, by = 'disp_id') %>%
  rename(
    type_client = type.x,
    type_card = type.y,
    card_issued = issued_date
  ) %>%
  select(-birth_number, -month, -issued)
# gender count across all clients
gender_group <- count(client_disp, gender)
gender_group
# disposition type count across all clients
gen_client_group <- count(client_disp, type_client)
gen_client_group
# disposition type count within each gender (result stays grouped by gender)
gen_client_test <- count(group_by(client_disp, gender), type_client)
gen_client_test
# TODO: find out whether a single client holds several accounts
# gender spread Czech Republic 1998: female 51.37% https://data.worldbank.org/indicator/SP.POP.TOTL.FE.ZS?end=1998&locations=CZ&start=1998
Fragestellung Recherche Jenni
Quick overview research to get to know the outlines of the dataset and decide where to go deeper.
First I wanted to see the gender distribution of the clients.
As we can see in the plot below, there are slightly fewer women than men, but overall the genders are fairly balanced.
# Bar chart: number of clients per gender.
client_gender_j <- ggplot(client, aes(x = gender, fill = gender)) +
  geom_bar() +
  labs(
    title = '\nClients spread on gender\n',
    x = '\nGender\n',
    y = '\nNumber of Clients\n'
  ) +
  theme(plot.title = element_text(hjust = 0.5))
client_gender_j
# Dodged bar chart: owners vs. disponents, split by gender, with the count
# printed inside each bar.
# The x axis shows the disposition type (gender is the fill colour), so it
# is labelled accordingly; it was previously mislabelled as "Gender".
client_owner_j <- ggplot(client_disp, aes(x = type_client, fill = gender)) +
  geom_bar(position = 'dodge') +
  ggtitle('\nOwner/Disponent spread on gender\n') +
  theme(plot.title = element_text(hjust = 0.5)) +
  geom_text(aes(label = ..count..), stat = "count", position = position_dodge(width = 0.9), vjust = 2) +
  labs(y = '\nNumber of Clients\n', x = '\nClient type\n')
client_owner_j
# TODO: investigate further who makes the transfers, and the age groups
# sanity check with counts
print(count(client_disp))
owner_total <- sum(client_disp$type_client == 'OWNER')
cat('Total OWNER:', owner_total)
Total OWNER: 4500
# fixed: the column name was misspelled (`ype_client`), so the comparison ran
# on NULL and the total was always 0
dispo_total <- sum(client_disp$type_client == 'DISPONENT')
cat('Total DISPONENT: ', dispo_total)
Total DISPONENT: 869
female_client <- sum(client_disp$gender == 'f')
cat('Total female clients: ',female_client)
Total female clients: 2645
# fixed: this line used to print `female_client` again, which made the two
# genders look exactly equal
male_client <- sum(client_disp$gender == 'm')
cat('Total male clients: ', male_client)
Total male clients: 2724
cat('Female vs. male clients:', female_client, 'vs.', male_client)
Female vs. male clients: 2645 vs. 2724
# resolved: the apparent tie between the genders came from printing the female
# count twice; the counts differ and add up to the number of clients.
The general age distribution of the clients is useful baseline information as well.
We can see in the plot below that most clients are between 20 and 55 years old.
# Density of client age, one curve per gender.
client_age_j <- ggplot(client, aes(x = age, colour = gender)) +
  #geom_bar(alpha = 0.5)+
  geom_density() +
  labs(
    title = '\nClientspread on age, separated by gender\n',
    x = '\nAge\n',
    y = '\ndensity\n'
  ) +
  theme(plot.title = element_text(hjust = 0.5))
client_age_j
# Number of clients for every gender/age combination
# (the result stays grouped by gender).
age_count <- summarise(group_by(client, gender, age), count = n())
`summarise()` has grouped output by 'gender'. You can override using the `.groups` argument.
age_count
# Line chart of the per-age client counts, one line per gender.
client_age_jl <- ggplot(age_count, aes(x = age, y = count, colour = gender)) +
  #geom_bar(alpha = 0.5)+
  # geom_point()+
  geom_line() +
  labs(title = '\nClientspread on age, separated by gender\n') +
  theme(plot.title = element_text(hjust = 0.5))
client_age_jl
NA
NA
# Bar chart: number of issued cards per card type, count printed in each bar.
card_type_j <- ggplot(card, aes(x = type, fill = type)) +
  geom_bar() +
  labs(title = '\nSpread of card types\n') +
  geom_text(aes(label = ..count..), stat = "count", position = position_dodge(width = 0.9), vjust = 2) +
  theme(plot.title = element_text(hjust = 0.5))
card_type_j
# Dodged bar chart of the disposition type (owner/disponent) by gender.
# The plot maps `type_client`, not the card type, so the title now says so;
# it previously read "Spread of card types".
# NOTE(review): if card types were intended here, this plot duplicates
# card_type_g_j and can be removed.
card_type_jx <- ggplot(client_disp, aes(x = type_client, fill = gender)) +
  geom_bar(position = 'dodge') +
  ggtitle('\nOwner/Disponent spread on gender\n') +
  geom_text(aes(label = ..count..), stat = "count", position = position_dodge(width = 0.9), vjust = 2) +
  theme(plot.title = element_text(hjust = 0.5))
card_type_jx
# Dodged bar chart: card type by gender, count printed in each bar.
card_type_g_j <- ggplot(client_disp, aes(x = type_card, fill = gender)) +
  geom_bar(position = 'dodge') +
  labs(title = '\nSpread of card types\n') +
  geom_text(aes(label = ..count..), stat = "count", position = position_dodge(width = 0.9), vjust = 2) +
  theme(plot.title = element_text(hjust = 0.5))
card_type_g_j
# Age density, one curve per card type.
# The curves are overlaid at their true x positions: `position = 'dodge'`
# is meant for bars and only triggered a "Width not defined" warning here.
card_type_age_g <- ggplot(client_disp, aes(x = age, colour = type_card)) +
  geom_density() +
  ggtitle('\nSpread of card types\n') +
  theme(plot.title = element_text(hjust = 0.5))
card_type_age_g
# Number of rows for every card type/age combination
# (the result stays grouped by type_card).
age_card <- summarise(group_by(client_disp, type_card, age), count = n())
`summarise()` has grouped output by 'type_card'. You can override using the `.groups` argument.
age_card
# Line chart of the per-age counts, one line per card type.
# Lines are drawn at their true x positions: `position = 'dodge'` is meant
# for bars and only triggered a "Width not defined" warning here.
card_type_age <- ggplot(age_card, aes(x = age, y = count, colour = type_card)) +
  geom_line() +
  ggtitle('\nSpread of card types\n') +
  theme(plot.title = element_text(hjust = 0.5))
card_type_age
# Split client_disp by gender and print both subsets.
age_card_m <- client_disp %>% filter(gender == 'm')
age_card_m
age_card_f <- client_disp %>% filter(gender == 'f')
age_card_f
# Male clients: number of rows per age/card type
# (the result stays grouped by age).
age_card_m <- summarise(group_by(age_card_m, age, type_card), count = n())
`summarise()` has grouped output by 'age'. You can override using the `.groups` argument.
# Female clients: number of rows per age/card type
# (the result stays grouped by age).
age_card_f <- summarise(group_by(age_card_f, age, type_card), count = n())
`summarise()` has grouped output by 'age'. You can override using the `.groups` argument.
# Per-age counts by card type, separately for male and female clients.
# Lines are drawn at their true x positions: `position = 'dodge'` is meant
# for bars and only triggered "Width not defined" warnings on these charts.
card_type_age_m <- ggplot(age_card_m, aes(x = age, y = count, colour = type_card)) +
  geom_line() +
  ggtitle('\nSpread of card types of male clients\n') +
  theme(plot.title = element_text(hjust = 0.5))
card_type_age_m
card_type_age_f <- ggplot(age_card_f, aes(x = age, y = count, colour = type_card)) +
  geom_line() +
  ggtitle('\nSpread of card types of female clients\n') +
  theme(plot.title = element_text(hjust = 0.5))
card_type_age_f
# (A leftover "<<<<<<< HEAD" merge-conflict marker was removed from this spot;
# it is not valid R and stops the script from parsing.)
# filter by card type: one data frame per card type
# rows whose card type is junior go into their own data frame, and so on
client_junior <- client_disp %>% filter(type_card == 'junior')
client_classic <- client_disp %>% filter(type_card == 'classic')
client_gold <- client_disp %>% filter(type_card == 'gold')
# all rows with a card / all rows without one
client_all_cards <- client_disp %>%
  filter(!is.na(type_card))
client_na <- client_disp %>%
  filter(is.na(type_card))
client_na
# Split the card-less rows at age 23.
# NOTE(review): 23 is presumably the upper age for a junior card - confirm.
client_na_junior <- client_na %>% filter(age <= 23)
client_na_classic_gold <- client_na %>% filter(age > 23)
# client_junior$age_card_issued2 <- client_junior$card_issued - client_junior$birthday
# Age (completed years) of each junior-card holder on the day the card was issued.
client_junior <- client_junior %>%
  mutate(
    age_card_issued = floor(age_calc(as.Date(birthday), card_issued, units = "years"))
  )
# Dodged bar chart: junior cards by holder age at issue, split by gender.
junior_issued_age_viz <- ggplot(client_junior, aes(x = age_card_issued, fill = gender)) +
  geom_bar(position = 'dodge') +
  # geom_density()+
  labs(title = '\nJunior cards issued on age of owner\n') +
  theme(plot.title = element_text(hjust = 0.5))
junior_issued_age_viz
# Dodged bar chart: age of the card-less clients aged 23 or younger, by gender.
client_na_junior_age_viz <- ggplot(client_na_junior, aes(x = age, fill = gender)) +
  geom_bar(position = 'dodge') +
  labs(title = '\npotential junior card clients age spread\n') +
  theme(plot.title = element_text(hjust = 0.5))
client_na_junior_age_viz
# Age (completed years) of each classic/gold card holder on the day the card
# was issued.
client_classic$age_card_issued <- floor(age_calc(as.Date(client_classic$birthday),client_classic$card_issued, units = 'years'))
client_gold$age_card_issued <- floor(age_calc(as.Date(client_gold$birthday),client_gold$card_issued, units = 'years'))
# Both charts now plot the age at issue, in line with their titles and with
# the junior-card chart; they previously plotted `age` (age on the project
# reference date) and left the freshly computed `age_card_issued` unused.
classic_issued_age_viz <- ggplot(client_classic, aes(x = age_card_issued, fill = gender)) +
  geom_bar(position = 'dodge') +
  ggtitle('\nClassic cards issued on age of owner\n') +
  theme(plot.title = element_text(hjust = 0.5))
classic_issued_age_viz
# open question: do we count everyone from age 20 as a potential classic-card client?
gold_issued_age_viz <- ggplot(client_gold, aes(x = age_card_issued, fill = gender)) +
  geom_bar(position = 'dodge') +
  ggtitle('\nGold cards issued on age of owner\n') +
  theme(plot.title = element_text(hjust = 0.5))
gold_issued_age_viz
# Attach every transaction to the card-holding rows via account_id and add
# two 0/1 indicator columns:
#  - trans_credit:     1 when the transaction type is 'credit'
#  - trans_withdrawal: 1 when the transaction type is exactly 'withdrawal'
# Both indicators are NA where the type is missing. Afterwards the
# categorical transaction columns are stored as factors.
cardowner_transactions <- client_all_cards %>%
  inner_join(trans, by = 'account_id') %>%
  mutate(
    trans_credit = case_when(
      type == 'credit' ~ 1,
      type != 'credit' ~ 0
    ),
    trans_withdrawal = case_when(
      type == 'withdrawal' ~ 1,
      type != 'withdrawal' ~ 0
    )
  ) %>%
  mutate(across(c(type, operation, k_symbol, bank), as.factor))
# One transaction table per card type.
junior_card_transactions <- cardowner_transactions %>% filter(type_card == 'junior')
classic_card_transactions <- cardowner_transactions %>% filter(type_card == 'classic')
gold_card_transactions <- cardowner_transactions %>% filter(type_card == 'gold')
# Junior cards: number of transactions per account and transaction type
# (the result stays grouped by account_id).
junior_card_transactions_2 <- summarise(
  group_by(junior_card_transactions, account_id, type),
  count = n()
)
`summarise()` has grouped output by 'account_id'. You can override using the `.groups` argument.
junior_card_transactions_2
NA
NA
NA
# TODO: first compute the average per gender, then report the mean per gender
# Count of transactions per type, for each card type.
junior_type_sum <- summary(junior_card_transactions$type)
classic_type_sum <- summary(classic_card_transactions$type)
gold_type_sum <- summary(gold_card_transactions$type)
print('junior_type')
[1] "junior_type"
junior_type_sum
credit withdrawal withdrawal in cash
14681 20849 809
print('classic_type')
[1] "classic_type"
classic_type_sum
credit withdrawal withdrawal in cash
58985 96724 4150
print('gold_type')
[1] "gold_type"
gold_type_sum
credit withdrawal withdrawal in cash
9333 15803 604
# Junior cards: look at the transactions whose type is missing and count
# them per operation.
junior_card_transactions_na <- junior_card_transactions %>% filter(is.na(type))
junior_card_transactions_na_sum <- summary(junior_card_transactions_na$operation)
junior_card_transactions_na_sum
collection from other bank credit card withdrawal
0 0
credit in cash remittance to other bank
0 0
withdrawal in cash
0
# Classic cards: look at the transactions whose type is missing and count
# them per operation.
classic_card_transactions_na <- classic_card_transactions %>% filter(is.na(type))
classic_card_transactions_na_sum <- summary(classic_card_transactions_na$operation)
classic_card_transactions_na_sum
collection from other bank credit card withdrawal
0 0
credit in cash remittance to other bank
0 0
withdrawal in cash
0
# Gold cards: keep only the transactions whose type is missing.
gold_card_transactions_na <- gold_card_transactions %>% filter(is.na(type))
# Bar chart: number of clients per gender.
client_gender <- ggplot(client, aes(x = gender, fill = gender)) +
  geom_bar() +
  labs(
    title = '\nClients spread on gender\n',
    x = '\nGender\n',
    y = '\nNumber of Clients\n'
  ) +
  theme(plot.title = element_text(hjust = 0.5))
client_gender
The general information of the Age of the clients is a good base information as well.
# Density of client age, one curve per gender.
client_age <- ggplot(client, aes(x = age, colour = gender)) +
  # geom_bar(alpha = 0.5)+
  geom_density() +
  labs(
    title = '\nClientspread on age, separated by gender\n',
    x = '\nAge\n',
    y = '\ndensity\n'
  ) +
  theme(plot.title = element_text(hjust = 0.5))
client_age
# Bar chart: number of issued cards per card type.
card_type <- ggplot(card, aes(x = type, fill = type)) +
  geom_bar() +
  labs(title = '\nSpread of card types\n') +
  theme(plot.title = element_text(hjust = 0.5))
card_type
# Gold cards: count the type-less transactions per operation.
gold_card_transactions_na_sum <- summary(gold_card_transactions_na$operation)
gold_card_transactions_na_sum
collection from other bank credit card withdrawal
0 0
credit in cash remittance to other bank
0 0
withdrawal in cash
0
# Dodged bar chart of transaction type by gender for one card-type subset.
#   transactions: transaction table with `type` and `gender` columns
#   plot_title:   title shown above the chart
plot_trans_by_gender <- function(transactions, plot_title) {
  ggplot(transactions, aes(x = type, fill = gender)) +
    geom_bar(position = 'dodge') +
    ggtitle(plot_title) +
    theme(plot.title = element_text(hjust = 0.5))
}

junior_trans_viz <- plot_trans_by_gender(
  junior_card_transactions,
  '\nCredit compared to withdrawals Junior Card\n'
)
junior_trans_viz
classic_trans_viz <- plot_trans_by_gender(
  classic_card_transactions,
  '\nCredit compared to withdrawals classic card\n'
)
classic_trans_viz
gold_trans_viz <- plot_trans_by_gender(
  gold_card_transactions,
  '\nCredit compared to withdrawals gold card\n'
)
gold_trans_viz
# Clients with their dispositions (identifying birth columns and the client
# district removed), then joined onto the account table.
df_client_disp <- full_join(client, disp, by = "client_id") %>%
  select(-c(district_id, birth_number, birthday))
df_account_client_disp <- full_join(account, df_client_disp, by = "account_id")
str(df_account_client_disp)
'data.frame': 5369 obs. of 10 variables:
$ account_id : int 576 576 3818 3818 704 704 2378 2632 1972 1539 ...
$ district_id: int 55 55 74 74 55 55 16 24 77 1 ...
$ frequency : chr "monthly issuance" "monthly issuance" "monthly issuance" "monthly issuance" ...
$ date_ymd : Date, format: "1993-01-01" "1993-01-01" ...
$ client_id : int 692 693 4601 4602 844 845 2873 3177 2397 1866 ...
$ month : int 51 3 4 54 1 51 53 NA 7 56 ...
$ gender : Factor w/ 2 levels "f","m": 1 2 2 1 2 1 1 2 2 1 ...
$ age : num 61 62 62 63 52 44 22 59 79 55 ...
$ disp_id : int 692 693 4601 4602 844 845 2873 3177 2397 1866 ...
$ type : Factor w/ 2 levels "OWNER","DISPONENT": 1 2 1 2 1 2 1 1 1 1 ...
summary(df_account_client_disp)
account_id district_id frequency date_ymd
Min. : 1 Min. : 1.0 Length:5369 Min. :1993-01-01
1st Qu.: 1178 1st Qu.:14.0 Class :character 1st Qu.:1993-12-19
Median : 2349 Median :38.0 Mode :character Median :1996-01-03
Mean : 2767 Mean :37.3 Mean :1995-08-05
3rd Qu.: 3526 3rd Qu.:60.0 3rd Qu.:1996-11-03
Max. :11382 Max. :77.0 Max. :1997-12-29
client_id month gender age disp_id
Min. : 1 Min. : 1.00 f:2645 Min. :10.0 Min. : 1
1st Qu.: 1418 1st Qu.: 6.00 m:2724 1st Qu.:29.0 1st Qu.: 1418
Median : 2839 Median :51.00 Median :43.0 Median : 2839
Mean : 3359 Mean :33.18 Mean :43.8 Mean : 3337
3rd Qu.: 4257 3rd Qu.:57.00 3rd Qu.:57.0 3rd Qu.: 4257
Max. :13998 Max. :62.00 Max. :86.0 Max. :13690
NA's :441
type
OWNER :4500
DISPONENT: 869
nrow(df_account_client_disp)
[1] 5369
# density plot of client age, one curve per gender
ggplot(df_account_client_disp, aes(x = age, color = gender)) +
  geom_density(size = 1) +
  xlab('Age of clients')
#turning points in age
# Helper: number of rows in `clients` whose age equals `at_age`
# (rows with a missing age are not counted).
count_clients_aged <- function(clients, at_age) {
  sum(clients$age == at_age, na.rm = TRUE)
}
#in general
nr1 <- count_clients_aged(df_account_client_disp, 16)
nr1
[1] 30
nr2 <- count_clients_aged(df_account_client_disp, 17)
nr2
[1] 94
nr3 <- count_clients_aged(df_account_client_disp, 58)
nr3
[1] 104
nr4 <- count_clients_aged(df_account_client_disp, 59)
nr4
[1] 71
nr5 <- count_clients_aged(df_account_client_disp, 79)
nr5
[1] 46
nr6 <- count_clients_aged(df_account_client_disp, 80)
nr6
[1] 5
#turning points in age
#for women
df_account_client_disp_w <- df_account_client_disp %>%
  filter(gender == 'f')
nrw1 <- count_clients_aged(df_account_client_disp_w, 16)
nrw1
[1] 12
nrw2 <- count_clients_aged(df_account_client_disp_w, 17)
nrw2
[1] 49
nrw3 <- count_clients_aged(df_account_client_disp_w, 58)
nrw3
[1] 54
nrw4 <- count_clients_aged(df_account_client_disp_w, 59)
nrw4
[1] 26
nrw5 <- count_clients_aged(df_account_client_disp_w, 79)
nrw5
[1] 22
nrw6 <- count_clients_aged(df_account_client_disp_w, 80)
nrw6
[1] 2
#for men (the middle pair is checked at 62/63 instead of 58/59)
df_account_client_disp_m <- df_account_client_disp %>%
  filter(gender == 'm')
nrm1 <- count_clients_aged(df_account_client_disp_m, 16)
nrm1
[1] 18
nrm2 <- count_clients_aged(df_account_client_disp_m, 17)
nrm2
[1] 45
nrm3 <- count_clients_aged(df_account_client_disp_m, 62)
nrm3
[1] 53
nrm4 <- count_clients_aged(df_account_client_disp_m, 63)
nrm4
[1] 27
nrm5 <- count_clients_aged(df_account_client_disp_m, 79)
nrm5
[1] 24
nrm6 <- count_clients_aged(df_account_client_disp_m, 80)
nrm6
[1] 3
# bar chart, one bar per age
ggplot(df_account_client_disp, aes(x = age)) +
  geom_bar() +
  scale_fill_brewer(palette = "Paired")
# bar chart, women and men side by side, one bar per age
ggplot(df_account_client_disp, aes(x = age, fill = gender)) +
  geom_bar(position = 'dodge') +
  scale_fill_brewer(palette = "Paired") +
  xlab('Age of clients')
# stacked bar chart, women and men stacked, one bar per age
ggplot(df_account_client_disp, aes(x = age, fill = gender)) +
  geom_bar() +
  scale_fill_brewer(palette = "Paired") +
  xlab('Age of clients')
# two panels (one per gender), one bar per age
ggplot(df_account_client_disp, aes(x = age, fill = gender)) +
  geom_bar() +
  scale_fill_brewer(palette = "Paired") +
  facet_wrap(~ gender) +
  xlab('Age of clients')
# flipped stacked bar chart, women and men stacked, one bar per age
ggplot(df_account_client_disp, aes(x = age, fill = gender)) +
  geom_bar() +
  coord_flip() +
  scale_fill_brewer(palette = "Paired") +
  xlab('Age of clients')
# flipped bar chart, gender share within each age
ggplot(df_account_client_disp, aes(x = age, fill = gender)) +
  geom_bar(position = 'fill') +
  coord_flip() +
  scale_fill_brewer(palette = "Paired") +
  xlab('Age of clients')
# Bucket the clients into six age bands (plus a 'not known' fallback for
# ages that match no band).
age_band_levels <- c('1-16', '17-32', '33-48', '49-64', '65-72', '73-86')
df_account_client_disp <- df_account_client_disp %>%
  mutate(age_grouped = case_when(
    age <= 16 ~ '1-16',
    age <= 32 ~ '17-32',
    age <= 48 ~ '33-48',
    age <= 64 ~ '49-64',
    age <= 72 ~ '65-72',
    age <= 86 ~ '73-86',
    TRUE ~ 'not known'
  ))
# Factorize with explicit levels so charts show the bands in age order.
# as_factor() orders levels by first appearance in the data, which put the
# bars in an arbitrary order. Bands that do not occur are dropped, as before.
df_account_client_disp <- df_account_client_disp %>%
  mutate(
    age_grouped = fct_drop(
      factor(age_grouped, levels = c(age_band_levels, 'not known'))
    )
  )
# ordered copy of the age bands ('not known' is not a level here and
# becomes NA)
df_account_client_disp$age3 <-
  ordered(df_account_client_disp$age_grouped, levels = age_band_levels)
# bar chart, all clients by age group
ggplot(df_account_client_disp, aes(x = factor(age_grouped))) +
  geom_bar() +
  scale_fill_brewer(palette = "Paired") +
  xlab('Age of clients')
# two panels (one per gender), clients by age group
ggplot(df_account_client_disp, aes(x = age_grouped, fill = gender)) +
  geom_bar() +
  scale_fill_brewer(palette = "Paired") +
  facet_wrap(~ gender) +
  xlab('Age of clients')
# bar chart, women and men side by side, by age group
ggplot(df_account_client_disp, aes(x = age_grouped, fill = gender)) +
  geom_bar(position = 'dodge') +
  scale_fill_brewer(palette = "Paired") +
  xlab('Age of clients')
# stacked bar chart, women and men stacked, by age group
ggplot(df_account_client_disp, aes(x = age_grouped, fill = gender)) +
  geom_bar() +
  scale_fill_brewer(palette = "Paired") +
  xlab('Age of clients')
# Age bands in reversed order for the flipped charts: coord_flip() draws the
# first level at the bottom, so reversing the levels puts the youngest band
# on top and the chart reads in ascending age from top to bottom.
# The levels are set explicitly so the order does not depend on how
# `age_grouped` happens to be ordered. Previously `age3` was rebuilt here
# but never used - both charts still plotted `age_grouped`.
df_account_client_disp <- df_account_client_disp %>%
  mutate(age3 = fct_rev(fct_drop(factor(
    as.character(age_grouped),
    levels = c('1-16', '17-32', '33-48', '49-64', '65-72', '73-86', 'not known')
  ))))
#coordinate-flipped stacked bar chart, women and men stacked, by age group
df_account_client_disp %>% ggplot(aes(x = age3, fill = gender)) +
  geom_bar() + scale_fill_brewer(palette = "Paired") +
  coord_flip() + xlab('Age of clients')
#coordinate-flipped bar chart, gender share within each age group
# NOTE(review): with position = 'fill' the value axis is a share (0-1), not
# a density - consider renaming the 'Density' label.
df_account_client_disp %>% ggplot(aes(x = age3, fill = gender)) +
  geom_bar(position = 'fill') + scale_fill_brewer(palette = "Paired") + coord_flip() +
  xlab('Age of clients') + ylab('Density')